TL;DR

Here, we explore the TH2 dataset, to try and understand if it’s only the Fluidigm data that is somewhat off.

The tricky part of this analysis is that we don’t know the ground truth for this dataset, so it’s not easy to understand which representaion is better. However, we do not see the issues with W seen in the Fluidigm data, suggesting that that is a problem specific to that dataset.

Data

As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were not doublets.

Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("th2")
th2_core <- th2[grep("^ERCC-", rownames(th2), invert = TRUE),
                    which(colData(th2)$single=="OK" & colData(th2)$Buettner_filter=="PASS")]

filter <- apply(assay(th2_core)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 9365

To speed up the computations, I will focus on the top 1,000 most variable genes.

th2_core <- th2_core[filter,]
core <- assay(th2_core)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

detection_rate <- colSums(core>0)
coverage <- colSums(core)
gata3 <- log2(assay(th2_core)["Gata3",])

pca <- prcomp(t(log1p(core)))
data.frame(pca$x) %>% ggplot(aes(PC1, PC2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                         PC1          PC2 detection_rate   coverage
## PC1             1.000000000 -0.002629017     0.95570040  0.5829114
## PC2            -0.002629017  1.000000000    -0.04921096 -0.2084226
## detection_rate  0.955700402 -0.049210957     1.00000000  0.4791100
## coverage        0.582911392 -0.208422590     0.47910999  1.0000000

V intercept in both Mu and Pi

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 103.427  22.829  47.788

Plot the results with cells colored according to Gata3, mimicking the figure from the original paper.

data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

Explore Gamma estimates

One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                         W1           W2    gamma_mu    gamma_pi
## W1              1.00000000  0.020886076 -0.48763389  0.14491237
## W2              0.02088608  1.000000000  0.05333496 -0.06962025
## gamma_mu       -0.48763389  0.053334956  1.00000000 -0.10932327
## gamma_pi        0.14491237 -0.069620253 -0.10932327  1.00000000
## detection_rate -0.48564630 -0.004187131  0.69492980 -0.44874112
## coverage       -0.17653359  0.118719572  0.83259494 -0.28188900
##                detection_rate   coverage
## W1               -0.485646297 -0.1765336
## W2               -0.004187131  0.1187196
## gamma_mu          0.694929799  0.8325949
## gamma_pi         -0.448741123 -0.2818890
## detection_rate    1.000000000  0.4791100
## coverage          0.479109992  1.0000000

\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.

par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19)

plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19)

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

No V intercept

print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
##    user  system elapsed 
## 104.388  21.766  47.100
data.frame(W1=zinb_Vnone@W[,1], W2=zinb_Vnone@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                        W1         W2 detection_rate   coverage
## W1              1.0000000  0.1125365     -0.4053970 -0.1227605
## W2              0.1125365  1.0000000     -0.6382453 -0.8917235
## detection_rate -0.4053970 -0.6382453      1.0000000  0.4791100
## coverage       -0.1227605 -0.8917235      0.4791100  1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19)

plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19)

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Mu

V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))

print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
##    user  system elapsed 
## 128.814  26.974  58.236
data.frame(W1=zinb_Vmu@W[,1], W2=zinb_Vmu@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu detection_rate
## W1              1.00000000  0.08210808 -0.63936222    -0.65269334
## W2              0.08210808  1.00000000 -0.02877313    -0.05977616
## gamma_mu       -0.63936222 -0.02877313  1.00000000     0.71618192
## detection_rate -0.65269334 -0.05977616  0.71618192     1.00000000
## coverage       -0.32921130  0.13899708  0.87127556     0.47910999
##                  coverage
## W1             -0.3292113
## W2              0.1389971
## gamma_mu        0.8712756
## detection_rate  0.4791100
## coverage        1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19)

plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19)

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

V intercept only in Pi

print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
##    user  system elapsed 
## 133.406  36.809  63.523
data.frame(W1=zinb_Vpi@W[,1], W2=zinb_Vpi@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                         W1           W2     gamma_pi detection_rate
## W1             1.000000000  0.234104187  0.007327167      0.1082811
## W2             0.234104187  1.000000000  0.007521908     -0.5629865
## gamma_pi       0.007327167  0.007521908  1.000000000     -0.4664147
## detection_rate 0.108281146 -0.562986496 -0.466414709      1.0000000
## coverage       0.594547225 -0.421056475 -0.157205453      0.4791100
##                  coverage
## W1              0.5945472
## W2             -0.4210565
## gamma_pi       -0.1572055
## detection_rate  0.4791100
## coverage        1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19)

plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19)

It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).

par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")

plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="logcounts_th2.csv")

Using all genes

core <- assay(th2_core)
dim(core)
## [1] 9365   79
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 671.873  71.356 296.410
data.frame(W1=zinb_all@W[,1], W2=zinb_all@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

Interestingly, when using all the genes, things still work.

ZIFA (top 1000 most variable genes)

Full algorithm

zifa_res <- read.csv("zifa_th2_full.csv", header=FALSE)

data.frame(Z1=zifa_res[,1], Z2=zifa_res[,2]) %>% ggplot(aes(Z1, Z2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()

Block algorithm

zifa_res <- read.csv("zifa_th2.csv", header=FALSE)

data.frame(Z1=zifa_res[,1], Z2=zifa_res[,2]) %>% ggplot(aes(Z1, Z2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()